{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3.2 线性回归模型评估"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3.2.1 模型评估的编程实现"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "3.1.3 代码汇总 ：不同行业工作年限与收入的线性回归模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "系数a为:2497.1513476046866\n",
      "截距b为:10143.131966873787\n"
     ]
    }
   ],
   "source": [
    "# 1.读取数据\n",
    "import pandas\n",
    "df = pandas.read_excel('IT行业收入表.xlsx')\n",
    "X = df[['工龄']]\n",
    "Y = df['薪水']\n",
    "\n",
    "# 2.模型训练\n",
    "from sklearn.linear_model import LinearRegression\n",
    "regr = LinearRegression()\n",
    "regr.fit(X,Y)\n",
    "\n",
    "# 3.模型可视化\n",
    "from matplotlib import pyplot as plt\n",
    "plt.scatter(X,Y)\n",
    "plt.plot(X, regr.predict(X), color='red')  # color='red'设置为红色\n",
    "plt.xlabel('工龄')\n",
    "plt.ylabel('薪水')\n",
    "plt.show()\n",
    "\n",
    "# 4.线性回归方程构造\n",
    "print('系数a为:' + str(regr.coef_[0]))\n",
    "print('截距b为:' + str(regr.intercept_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Anaconda\\Anaconda\\lib\\site-packages\\numpy\\core\\fromnumeric.py:2495: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.\n",
      "  return ptp(axis=axis, out=out, **kwargs)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>           <td>薪水</td>        <th>  R-squared:         </th> <td>   0.855</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.854</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   578.5</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 01 Jan 2020</td> <th>  Prob (F-statistic):</th> <td>6.69e-43</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>19:43:12</td>     <th>  Log-Likelihood:    </th> <td> -930.83</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   100</td>      <th>  AIC:               </th> <td>   1866.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>    98</td>      <th>  BIC:               </th> <td>   1871.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     1</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "    <td></td>       <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th> <td> 1.014e+04</td> <td>  507.633</td> <td>   19.981</td> <td> 0.000</td> <td> 9135.751</td> <td> 1.12e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>工龄</th>    <td> 2497.1513</td> <td>  103.823</td> <td>   24.052</td> <td> 0.000</td> <td> 2291.118</td> <td> 2703.185</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td> 0.287</td> <th>  Durbin-Watson:     </th> <td>   0.555</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.867</td> <th>  Jarque-Bera (JB):  </th> <td>   0.463</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 0.007</td> <th>  Prob(JB):          </th> <td>   0.793</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 2.667</td> <th>  Cond. No.          </th> <td>    9.49</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                     薪水   R-squared:                       0.855\n",
       "Model:                            OLS   Adj. R-squared:                  0.854\n",
       "Method:                 Least Squares   F-statistic:                     578.5\n",
       "Date:                Wed, 01 Jan 2020   Prob (F-statistic):           6.69e-43\n",
       "Time:                        19:43:12   Log-Likelihood:                -930.83\n",
       "No. Observations:                 100   AIC:                             1866.\n",
       "Df Residuals:                      98   BIC:                             1871.\n",
       "Df Model:                           1                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const       1.014e+04    507.633     19.981      0.000    9135.751    1.12e+04\n",
       "工龄          2497.1513    103.823     24.052      0.000    2291.118    2703.185\n",
       "==============================================================================\n",
       "Omnibus:                        0.287   Durbin-Watson:                   0.555\n",
       "Prob(Omnibus):                  0.867   Jarque-Bera (JB):                0.463\n",
       "Skew:                           0.007   Prob(JB):                        0.793\n",
       "Kurtosis:                       2.667   Cond. No.                         9.49\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "\"\"\""
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import statsmodels.api as sm\n",
    "X2 = sm.add_constant(X)\n",
    "est = sm.OLS(Y, X2).fit()\n",
    "est.summary()  # 在非Jupyter Notebook的编辑器中需要写成print(est.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 如果设置成一元二次方程，来看下模型评估效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>           <td>薪水</td>        <th>  R-squared:         </th> <td>   0.931</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.930</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   654.8</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Wed, 01 Jan 2020</td> <th>  Prob (F-statistic):</th> <td>4.70e-57</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>19:43:12</td>     <th>  Log-Likelihood:    </th> <td> -893.72</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>   100</td>      <th>  AIC:               </th> <td>   1793.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>    97</td>      <th>  BIC:               </th> <td>   1801.</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     2</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "    <td></td>       <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>const</th> <td> 1.399e+04</td> <td>  512.264</td> <td>   27.307</td> <td> 0.000</td> <td>  1.3e+04</td> <td>  1.5e+04</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x1</th>    <td> -743.6808</td> <td>  321.809</td> <td>   -2.311</td> <td> 0.023</td> <td>-1382.383</td> <td> -104.979</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>x2</th>    <td>  400.8040</td> <td>   38.790</td> <td>   10.333</td> <td> 0.000</td> <td>  323.816</td> <td>  477.792</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td> 2.440</td> <th>  Durbin-Watson:     </th> <td>   1.137</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.295</td> <th>  Jarque-Bera (JB):  </th> <td>   2.083</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td>-0.352</td> <th>  Prob(JB):          </th> <td>   0.353</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 3.063</td> <th>  Cond. No.          </th> <td>    102.</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                     薪水   R-squared:                       0.931\n",
       "Model:                            OLS   Adj. R-squared:                  0.930\n",
       "Method:                 Least Squares   F-statistic:                     654.8\n",
       "Date:                Wed, 01 Jan 2020   Prob (F-statistic):           4.70e-57\n",
       "Time:                        19:43:12   Log-Likelihood:                -893.72\n",
       "No. Observations:                 100   AIC:                             1793.\n",
       "Df Residuals:                      97   BIC:                             1801.\n",
       "Df Model:                           2                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "const       1.399e+04    512.264     27.307      0.000     1.3e+04     1.5e+04\n",
       "x1          -743.6808    321.809     -2.311      0.023   -1382.383    -104.979\n",
       "x2           400.8040     38.790     10.333      0.000     323.816     477.792\n",
       "==============================================================================\n",
       "Omnibus:                        2.440   Durbin-Watson:                   1.137\n",
       "Prob(Omnibus):                  0.295   Jarque-Bera (JB):                2.083\n",
       "Skew:                          -0.352   Prob(JB):                        0.353\n",
       "Kurtosis:                       3.063   Cond. No.                         102.\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "\"\"\""
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "poly_reg = PolynomialFeatures(degree=2)\n",
    "X_ = poly_reg.fit_transform(X)\n",
    "\n",
    "import statsmodels.api as sm\n",
    "X2 = sm.add_constant(X_)  # 这里传入的是含有x^2的X_\n",
    "est = sm.OLS(Y, X2).fit()\n",
    "est.summary()  # 在非Jupyter Notebook的编辑器中需要写成print(est.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可以看到模型效果的确有所提升"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**补充知识点：另一种获取R-squared值的代码实现**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 1.读取数据\n",
    "import pandas\n",
    "df = pandas.read_excel('IT行业收入表.xlsx')\n",
    "X = df[['工龄']]\n",
    "Y = df['薪水']\n",
    "\n",
    "# 2.模型训练\n",
    "from sklearn.linear_model import LinearRegression\n",
    "regr = LinearRegression()\n",
    "regr.fit(X,Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8551365584870814\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import r2_score\n",
    "r2 = r2_score(Y, regr.predict(X))\n",
    "print(r2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可以看到和之前通过statsmodels库评估的结果是一致的。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
